{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导包\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.preprocessing import MinMaxScaler,StandardScaler\n",
    "from sklearn.preprocessing import LabelEncoder,OneHotEncoder\n",
    "from sklearn.preprocessing import Normalizer\n",
    "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
    "from sklearn.decomposition import PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>satisfaction_level</th>\n",
       "      <th>last_evaluation</th>\n",
       "      <th>number_project</th>\n",
       "      <th>average_monthly_hours</th>\n",
       "      <th>time_spend_company</th>\n",
       "      <th>Work_accident</th>\n",
       "      <th>left</th>\n",
       "      <th>promotion_last_5years</th>\n",
       "      <th>department</th>\n",
       "      <th>salary</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0.38</td>\n",
       "      <td>0.53</td>\n",
       "      <td>2</td>\n",
       "      <td>157</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.80</td>\n",
       "      <td>0.86</td>\n",
       "      <td>5</td>\n",
       "      <td>262</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "      <td>medium</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0.11</td>\n",
       "      <td>0.88</td>\n",
       "      <td>7</td>\n",
       "      <td>272</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "      <td>medium</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.72</td>\n",
       "      <td>0.87</td>\n",
       "      <td>5</td>\n",
       "      <td>223</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0.37</td>\n",
       "      <td>0.52</td>\n",
       "      <td>2</td>\n",
       "      <td>159</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14997</td>\n",
       "      <td>0.11</td>\n",
       "      <td>0.96</td>\n",
       "      <td>6</td>\n",
       "      <td>280</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>support</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14998</td>\n",
       "      <td>0.37</td>\n",
       "      <td>0.52</td>\n",
       "      <td>2</td>\n",
       "      <td>158</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>support</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14999</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.52</td>\n",
       "      <td>2</td>\n",
       "      <td>158</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>support</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>15000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>999999.00</td>\n",
       "      <td>2</td>\n",
       "      <td>158</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sale</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>15001</td>\n",
       "      <td>0.70</td>\n",
       "      <td>0.40</td>\n",
       "      <td>2</td>\n",
       "      <td>158</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sale</td>\n",
       "      <td>nme</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>15002 rows × 10 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       satisfaction_level  last_evaluation  number_project  \\\n",
       "0                    0.38             0.53               2   \n",
       "1                    0.80             0.86               5   \n",
       "2                    0.11             0.88               7   \n",
       "3                    0.72             0.87               5   \n",
       "4                    0.37             0.52               2   \n",
       "...                   ...              ...             ...   \n",
       "14997                0.11             0.96               6   \n",
       "14998                0.37             0.52               2   \n",
       "14999                 NaN             0.52               2   \n",
       "15000                 NaN        999999.00               2   \n",
       "15001                0.70             0.40               2   \n",
       "\n",
       "       average_monthly_hours  time_spend_company  Work_accident  left  \\\n",
       "0                        157                   3              0     1   \n",
       "1                        262                   6              0     1   \n",
       "2                        272                   4              0     1   \n",
       "3                        223                   5              0     1   \n",
       "4                        159                   3              0     1   \n",
       "...                      ...                 ...            ...   ...   \n",
       "14997                    280                   4              0     1   \n",
       "14998                    158                   3              0     1   \n",
       "14999                    158                   3              0     1   \n",
       "15000                    158                   3              0     1   \n",
       "15001                    158                   2              0     1   \n",
       "\n",
       "       promotion_last_5years department  salary  \n",
       "0                          0      sales     low  \n",
       "1                          0      sales  medium  \n",
       "2                          0      sales  medium  \n",
       "3                          0      sales     low  \n",
       "4                          0      sales     low  \n",
       "...                      ...        ...     ...  \n",
       "14997                      0    support     low  \n",
       "14998                      0    support     low  \n",
       "14999                      0    support     low  \n",
       "15000                      0       sale     low  \n",
       "15001                      0       sale     nme  \n",
       "\n",
       "[15002 rows x 10 columns]"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 读数据\n",
    "df = pd.read_csv('./data/HR.csv')\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>satisfaction_level</th>\n",
       "      <th>last_evaluation</th>\n",
       "      <th>number_project</th>\n",
       "      <th>average_monthly_hours</th>\n",
       "      <th>time_spend_company</th>\n",
       "      <th>Work_accident</th>\n",
       "      <th>left</th>\n",
       "      <th>promotion_last_5years</th>\n",
       "      <th>department</th>\n",
       "      <th>salary</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0.38</td>\n",
       "      <td>0.53</td>\n",
       "      <td>2</td>\n",
       "      <td>157</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.80</td>\n",
       "      <td>0.86</td>\n",
       "      <td>5</td>\n",
       "      <td>262</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "      <td>medium</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0.11</td>\n",
       "      <td>0.88</td>\n",
       "      <td>7</td>\n",
       "      <td>272</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "      <td>medium</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.72</td>\n",
       "      <td>0.87</td>\n",
       "      <td>5</td>\n",
       "      <td>223</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0.37</td>\n",
       "      <td>0.52</td>\n",
       "      <td>2</td>\n",
       "      <td>159</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14994</td>\n",
       "      <td>0.40</td>\n",
       "      <td>0.57</td>\n",
       "      <td>2</td>\n",
       "      <td>151</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>support</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14995</td>\n",
       "      <td>0.37</td>\n",
       "      <td>0.48</td>\n",
       "      <td>2</td>\n",
       "      <td>160</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>support</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14996</td>\n",
       "      <td>0.37</td>\n",
       "      <td>0.53</td>\n",
       "      <td>2</td>\n",
       "      <td>143</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>support</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14997</td>\n",
       "      <td>0.11</td>\n",
       "      <td>0.96</td>\n",
       "      <td>6</td>\n",
       "      <td>280</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>support</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14998</td>\n",
       "      <td>0.37</td>\n",
       "      <td>0.52</td>\n",
       "      <td>2</td>\n",
       "      <td>158</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>support</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>14999 rows × 10 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       satisfaction_level  last_evaluation  number_project  \\\n",
       "0                    0.38             0.53               2   \n",
       "1                    0.80             0.86               5   \n",
       "2                    0.11             0.88               7   \n",
       "3                    0.72             0.87               5   \n",
       "4                    0.37             0.52               2   \n",
       "...                   ...              ...             ...   \n",
       "14994                0.40             0.57               2   \n",
       "14995                0.37             0.48               2   \n",
       "14996                0.37             0.53               2   \n",
       "14997                0.11             0.96               6   \n",
       "14998                0.37             0.52               2   \n",
       "\n",
       "       average_monthly_hours  time_spend_company  Work_accident  left  \\\n",
       "0                        157                   3              0     1   \n",
       "1                        262                   6              0     1   \n",
       "2                        272                   4              0     1   \n",
       "3                        223                   5              0     1   \n",
       "4                        159                   3              0     1   \n",
       "...                      ...                 ...            ...   ...   \n",
       "14994                    151                   3              0     1   \n",
       "14995                    160                   3              0     1   \n",
       "14996                    143                   3              0     1   \n",
       "14997                    280                   4              0     1   \n",
       "14998                    158                   3              0     1   \n",
       "\n",
       "       promotion_last_5years department  salary  \n",
       "0                          0      sales     low  \n",
       "1                          0      sales  medium  \n",
       "2                          0      sales  medium  \n",
       "3                          0      sales     low  \n",
       "4                          0      sales     low  \n",
       "...                      ...        ...     ...  \n",
       "14994                      0    support     low  \n",
       "14995                      0    support     low  \n",
       "14996                      0    support     low  \n",
       "14997                      0    support     low  \n",
       "14998                      0    support     low  \n",
       "\n",
       "[14999 rows x 10 columns]"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 1.清洗数据\n",
    "df = df.dropna(subset=['satisfaction_level'])\n",
    "df = df[df['last_evaluation']<=1][df['salary']!='nme']\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0        1\n",
       "1        1\n",
       "2        1\n",
       "3        1\n",
       "4        1\n",
       "        ..\n",
       "14994    1\n",
       "14995    1\n",
       "14996    1\n",
       "14997    1\n",
       "14998    1\n",
       "Name: left, Length: 14999, dtype: int64"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 2.得到标注\n",
    "label = df['left']\n",
    "label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>satisfaction_level</th>\n",
       "      <th>last_evaluation</th>\n",
       "      <th>number_project</th>\n",
       "      <th>average_monthly_hours</th>\n",
       "      <th>time_spend_company</th>\n",
       "      <th>Work_accident</th>\n",
       "      <th>promotion_last_5years</th>\n",
       "      <th>department</th>\n",
       "      <th>salary</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0.38</td>\n",
       "      <td>0.53</td>\n",
       "      <td>2</td>\n",
       "      <td>157</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.80</td>\n",
       "      <td>0.86</td>\n",
       "      <td>5</td>\n",
       "      <td>262</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "      <td>medium</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0.11</td>\n",
       "      <td>0.88</td>\n",
       "      <td>7</td>\n",
       "      <td>272</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "      <td>medium</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.72</td>\n",
       "      <td>0.87</td>\n",
       "      <td>5</td>\n",
       "      <td>223</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0.37</td>\n",
       "      <td>0.52</td>\n",
       "      <td>2</td>\n",
       "      <td>159</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14994</td>\n",
       "      <td>0.40</td>\n",
       "      <td>0.57</td>\n",
       "      <td>2</td>\n",
       "      <td>151</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>support</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14995</td>\n",
       "      <td>0.37</td>\n",
       "      <td>0.48</td>\n",
       "      <td>2</td>\n",
       "      <td>160</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>support</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14996</td>\n",
       "      <td>0.37</td>\n",
       "      <td>0.53</td>\n",
       "      <td>2</td>\n",
       "      <td>143</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>support</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14997</td>\n",
       "      <td>0.11</td>\n",
       "      <td>0.96</td>\n",
       "      <td>6</td>\n",
       "      <td>280</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>support</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14998</td>\n",
       "      <td>0.37</td>\n",
       "      <td>0.52</td>\n",
       "      <td>2</td>\n",
       "      <td>158</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>support</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>14999 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       satisfaction_level  last_evaluation  number_project  \\\n",
       "0                    0.38             0.53               2   \n",
       "1                    0.80             0.86               5   \n",
       "2                    0.11             0.88               7   \n",
       "3                    0.72             0.87               5   \n",
       "4                    0.37             0.52               2   \n",
       "...                   ...              ...             ...   \n",
       "14994                0.40             0.57               2   \n",
       "14995                0.37             0.48               2   \n",
       "14996                0.37             0.53               2   \n",
       "14997                0.11             0.96               6   \n",
       "14998                0.37             0.52               2   \n",
       "\n",
       "       average_monthly_hours  time_spend_company  Work_accident  \\\n",
       "0                        157                   3              0   \n",
       "1                        262                   6              0   \n",
       "2                        272                   4              0   \n",
       "3                        223                   5              0   \n",
       "4                        159                   3              0   \n",
       "...                      ...                 ...            ...   \n",
       "14994                    151                   3              0   \n",
       "14995                    160                   3              0   \n",
       "14996                    143                   3              0   \n",
       "14997                    280                   4              0   \n",
       "14998                    158                   3              0   \n",
       "\n",
       "       promotion_last_5years department  salary  \n",
       "0                          0      sales     low  \n",
       "1                          0      sales  medium  \n",
       "2                          0      sales  medium  \n",
       "3                          0      sales     low  \n",
       "4                          0      sales     low  \n",
       "...                      ...        ...     ...  \n",
       "14994                      0    support     low  \n",
       "14995                      0    support     low  \n",
       "14996                      0    support     low  \n",
       "14997                      0    support     low  \n",
       "14998                      0    support     low  \n",
       "\n",
       "[14999 rows x 9 columns]"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = df.drop('left',axis=1)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>satisfaction_level</th>\n",
       "      <th>last_evaluation</th>\n",
       "      <th>number_project</th>\n",
       "      <th>average_monthly_hours</th>\n",
       "      <th>time_spend_company</th>\n",
       "      <th>Work_accident</th>\n",
       "      <th>promotion_last_5years</th>\n",
       "      <th>department</th>\n",
       "      <th>salary</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0.38</td>\n",
       "      <td>0.53</td>\n",
       "      <td>2</td>\n",
       "      <td>157</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.80</td>\n",
       "      <td>0.86</td>\n",
       "      <td>5</td>\n",
       "      <td>262</td>\n",
       "      <td>6</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "      <td>medium</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0.11</td>\n",
       "      <td>0.88</td>\n",
       "      <td>7</td>\n",
       "      <td>272</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "      <td>medium</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.72</td>\n",
       "      <td>0.87</td>\n",
       "      <td>5</td>\n",
       "      <td>223</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0.37</td>\n",
       "      <td>0.52</td>\n",
       "      <td>2</td>\n",
       "      <td>159</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>sales</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14994</td>\n",
       "      <td>0.40</td>\n",
       "      <td>0.57</td>\n",
       "      <td>2</td>\n",
       "      <td>151</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>support</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14995</td>\n",
       "      <td>0.37</td>\n",
       "      <td>0.48</td>\n",
       "      <td>2</td>\n",
       "      <td>160</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>support</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14996</td>\n",
       "      <td>0.37</td>\n",
       "      <td>0.53</td>\n",
       "      <td>2</td>\n",
       "      <td>143</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>support</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14997</td>\n",
       "      <td>0.11</td>\n",
       "      <td>0.96</td>\n",
       "      <td>6</td>\n",
       "      <td>280</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>support</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14998</td>\n",
       "      <td>0.37</td>\n",
       "      <td>0.52</td>\n",
       "      <td>2</td>\n",
       "      <td>158</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>support</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>14999 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       satisfaction_level  last_evaluation  number_project  \\\n",
       "0                    0.38             0.53               2   \n",
       "1                    0.80             0.86               5   \n",
       "2                    0.11             0.88               7   \n",
       "3                    0.72             0.87               5   \n",
       "4                    0.37             0.52               2   \n",
       "...                   ...              ...             ...   \n",
       "14994                0.40             0.57               2   \n",
       "14995                0.37             0.48               2   \n",
       "14996                0.37             0.53               2   \n",
       "14997                0.11             0.96               6   \n",
       "14998                0.37             0.52               2   \n",
       "\n",
       "       average_monthly_hours  time_spend_company  Work_accident  \\\n",
       "0                        157                   3              0   \n",
       "1                        262                   6              0   \n",
       "2                        272                   4              0   \n",
       "3                        223                   5              0   \n",
       "4                        159                   3              0   \n",
       "...                      ...                 ...            ...   \n",
       "14994                    151                   3              0   \n",
       "14995                    160                   3              0   \n",
       "14996                    143                   3              0   \n",
       "14997                    280                   4              0   \n",
       "14998                    158                   3              0   \n",
       "\n",
       "       promotion_last_5years department  salary  \n",
       "0                          0      sales     low  \n",
       "1                          0      sales  medium  \n",
       "2                          0      sales  medium  \n",
       "3                          0      sales     low  \n",
       "4                          0      sales     low  \n",
       "...                      ...        ...     ...  \n",
       "14994                      0    support     low  \n",
       "14995                      0    support     low  \n",
       "14996                      0    support     low  \n",
       "14997                      0    support     low  \n",
       "14998                      0    support     low  \n",
       "\n",
       "[14999 rows x 9 columns]"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 3.特征选择，特征较少，先不删除特征\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>satisfaction_level</th>\n",
       "      <th>last_evaluation</th>\n",
       "      <th>number_project</th>\n",
       "      <th>average_monthly_hours</th>\n",
       "      <th>time_spend_company</th>\n",
       "      <th>Work_accident</th>\n",
       "      <th>promotion_last_5years</th>\n",
       "      <th>department</th>\n",
       "      <th>salary</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>-0.936495</td>\n",
       "      <td>-1.087275</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.285047</td>\n",
       "      <td>0.125</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>sales</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.752814</td>\n",
       "      <td>0.840707</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.775701</td>\n",
       "      <td>0.500</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>sales</td>\n",
       "      <td>medium</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>-2.022479</td>\n",
       "      <td>0.957554</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.822430</td>\n",
       "      <td>0.250</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>sales</td>\n",
       "      <td>medium</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.431041</td>\n",
       "      <td>0.899131</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.593458</td>\n",
       "      <td>0.375</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>sales</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>-0.976716</td>\n",
       "      <td>-1.145699</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.294393</td>\n",
       "      <td>0.125</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>sales</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14994</td>\n",
       "      <td>-0.856051</td>\n",
       "      <td>-0.853580</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.257009</td>\n",
       "      <td>0.125</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>support</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14995</td>\n",
       "      <td>-0.976716</td>\n",
       "      <td>-1.379394</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.299065</td>\n",
       "      <td>0.125</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>support</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14996</td>\n",
       "      <td>-0.976716</td>\n",
       "      <td>-1.087275</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.219626</td>\n",
       "      <td>0.125</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>support</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14997</td>\n",
       "      <td>-2.022479</td>\n",
       "      <td>1.424944</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.859813</td>\n",
       "      <td>0.250</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>support</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14998</td>\n",
       "      <td>-0.976716</td>\n",
       "      <td>-1.145699</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.289720</td>\n",
       "      <td>0.125</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>support</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>14999 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       satisfaction_level  last_evaluation  number_project  \\\n",
       "0               -0.936495        -1.087275             0.0   \n",
       "1                0.752814         0.840707             0.6   \n",
       "2               -2.022479         0.957554             1.0   \n",
       "3                0.431041         0.899131             0.6   \n",
       "4               -0.976716        -1.145699             0.0   \n",
       "...                   ...              ...             ...   \n",
       "14994           -0.856051        -0.853580             0.0   \n",
       "14995           -0.976716        -1.379394             0.0   \n",
       "14996           -0.976716        -1.087275             0.0   \n",
       "14997           -2.022479         1.424944             0.8   \n",
       "14998           -0.976716        -1.145699             0.0   \n",
       "\n",
       "       average_monthly_hours  time_spend_company  Work_accident  \\\n",
       "0                   0.285047               0.125            0.0   \n",
       "1                   0.775701               0.500            0.0   \n",
       "2                   0.822430               0.250            0.0   \n",
       "3                   0.593458               0.375            0.0   \n",
       "4                   0.294393               0.125            0.0   \n",
       "...                      ...                 ...            ...   \n",
       "14994               0.257009               0.125            0.0   \n",
       "14995               0.299065               0.125            0.0   \n",
       "14996               0.219626               0.125            0.0   \n",
       "14997               0.859813               0.250            0.0   \n",
       "14998               0.289720               0.125            0.0   \n",
       "\n",
       "       promotion_last_5years department  salary  \n",
       "0                        0.0      sales     low  \n",
       "1                        0.0      sales  medium  \n",
       "2                        0.0      sales  medium  \n",
       "3                        0.0      sales     low  \n",
       "4                        0.0      sales     low  \n",
       "...                      ...        ...     ...  \n",
       "14994                    0.0    support     low  \n",
       "14995                    0.0    support     low  \n",
       "14996                    0.0    support     low  \n",
       "14997                    0.0    support     low  \n",
       "14998                    0.0    support     low  \n",
       "\n",
       "[14999 rows x 9 columns]"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 4.特征处理\n",
    "# 连续属性\n",
    "# sl表示satisfaction_level--False：MinMaxScaler；True：StandardScaler\n",
    "# le表示last_evaluation--False：MinMaxScaler；True：StandardScaler\n",
    "# npr表示number_project--False：MinMaxScaler；True：StandardScaler\n",
    "# amh表示average_monthly_hours--False：MinMaxScaler；True：StandardScaler\n",
    "# tsc表示time_spend_company--False：MinMaxScaler；True：StandardScaler\n",
    "# wa表示Work_accident--False：MinMaxScaler；True：StandardScaler\n",
    "# pl5表示promotion_last_5years--False：MinMaxScaler；True：StandardScaler\n",
    "def preprocess1(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False):\n",
    "    scaler_lst = [sl,le,npr,amh,tsc,wa,pl5]\n",
    "    column_lst = ['satisfaction_level','last_evaluation','number_project','average_monthly_hours',\n",
    "                 'time_spend_company','Work_accident','promotion_last_5years']\n",
    "    for i in range(len(scaler_lst)):\n",
    "        if not scaler_lst[i]:\n",
    "            df[column_lst[i]] = MinMaxScaler().fit_transform(df[column_lst[i]].values.reshape(-1,1))\n",
    "        else:\n",
    "            df[column_lst[i]] = StandardScaler().fit_transform(df[column_lst[i]].values.reshape(-1,1))\n",
    "\n",
    "preprocess1(sl=True,le=True)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>satisfaction_level</th>\n",
       "      <th>last_evaluation</th>\n",
       "      <th>number_project</th>\n",
       "      <th>average_monthly_hours</th>\n",
       "      <th>time_spend_company</th>\n",
       "      <th>Work_accident</th>\n",
       "      <th>promotion_last_5years</th>\n",
       "      <th>department</th>\n",
       "      <th>salary</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>-0.936495</td>\n",
       "      <td>-1.087275</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.285047</td>\n",
       "      <td>0.125</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.777778</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.752814</td>\n",
       "      <td>0.840707</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.775701</td>\n",
       "      <td>0.500</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.777778</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>-2.022479</td>\n",
       "      <td>0.957554</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.822430</td>\n",
       "      <td>0.250</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.777778</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.431041</td>\n",
       "      <td>0.899131</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.593458</td>\n",
       "      <td>0.375</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.777778</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>-0.976716</td>\n",
       "      <td>-1.145699</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.294393</td>\n",
       "      <td>0.125</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.777778</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14994</td>\n",
       "      <td>-0.856051</td>\n",
       "      <td>-0.853580</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.257009</td>\n",
       "      <td>0.125</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.888889</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14995</td>\n",
       "      <td>-0.976716</td>\n",
       "      <td>-1.379394</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.299065</td>\n",
       "      <td>0.125</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.888889</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14996</td>\n",
       "      <td>-0.976716</td>\n",
       "      <td>-1.087275</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.219626</td>\n",
       "      <td>0.125</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.888889</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14997</td>\n",
       "      <td>-2.022479</td>\n",
       "      <td>1.424944</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.859813</td>\n",
       "      <td>0.250</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.888889</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14998</td>\n",
       "      <td>-0.976716</td>\n",
       "      <td>-1.145699</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.289720</td>\n",
       "      <td>0.125</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.888889</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>14999 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       satisfaction_level  last_evaluation  number_project  \\\n",
       "0               -0.936495        -1.087275             0.0   \n",
       "1                0.752814         0.840707             0.6   \n",
       "2               -2.022479         0.957554             1.0   \n",
       "3                0.431041         0.899131             0.6   \n",
       "4               -0.976716        -1.145699             0.0   \n",
       "...                   ...              ...             ...   \n",
       "14994           -0.856051        -0.853580             0.0   \n",
       "14995           -0.976716        -1.379394             0.0   \n",
       "14996           -0.976716        -1.087275             0.0   \n",
       "14997           -2.022479         1.424944             0.8   \n",
       "14998           -0.976716        -1.145699             0.0   \n",
       "\n",
       "       average_monthly_hours  time_spend_company  Work_accident  \\\n",
       "0                   0.285047               0.125            0.0   \n",
       "1                   0.775701               0.500            0.0   \n",
       "2                   0.822430               0.250            0.0   \n",
       "3                   0.593458               0.375            0.0   \n",
       "4                   0.294393               0.125            0.0   \n",
       "...                      ...                 ...            ...   \n",
       "14994               0.257009               0.125            0.0   \n",
       "14995               0.299065               0.125            0.0   \n",
       "14996               0.219626               0.125            0.0   \n",
       "14997               0.859813               0.250            0.0   \n",
       "14998               0.289720               0.125            0.0   \n",
       "\n",
       "       promotion_last_5years  department  salary  \n",
       "0                        0.0    0.777778     0.0  \n",
       "1                        0.0    0.777778     0.5  \n",
       "2                        0.0    0.777778     0.5  \n",
       "3                        0.0    0.777778     0.0  \n",
       "4                        0.0    0.777778     0.0  \n",
       "...                      ...         ...     ...  \n",
       "14994                    0.0    0.888889     0.0  \n",
       "14995                    0.0    0.888889     0.0  \n",
       "14996                    0.0    0.888889     0.0  \n",
       "14997                    0.0    0.888889     0.0  \n",
       "14998                    0.0    0.888889     0.0  \n",
       "\n",
       "[14999 rows x 9 columns]"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 离散属性\n",
    "# slr表示salary--False：LabelEncoder；True：OneHotEncoder\n",
    "# dp表示department--False：LabelEncoder；True：OneHotEncoder\n",
    "def preprocess2(slr=False,dp=False):\n",
    "    global df\n",
    "    scaler_lst = [slr,dp]\n",
    "    column_lst = ['salary','department']\n",
    "    for i in range(len(scaler_lst)):\n",
    "        if not scaler_lst[i]:\n",
    "            if column_lst[i] == 'salary':\n",
    "                df[column_lst[i]] = [map_salary(s) for s in df['salary'].values]\n",
    "            else:\n",
    "                df[column_lst[i]] = LabelEncoder().fit_transform(df[column_lst[i]])\n",
    "            # 归一化处理\n",
    "            df[column_lst[i]] = MinMaxScaler().fit_transform(df[column_lst[i]].values.reshape(-1, 1))\n",
    "        else:\n",
    "            df = pd.get_dummies(df,columns=[column_lst[i]])\n",
    "\n",
    "# 数值化重写map函数，把salary对应到我们想要的数值\n",
    "def map_salary(s):\n",
    "    d = dict([('low',0),('medium',1),('high',2)])\n",
    "    return d.get(s,0)\n",
    "\n",
    "preprocess2()\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>satisfaction_level</th>\n",
       "      <th>last_evaluation</th>\n",
       "      <th>number_project</th>\n",
       "      <th>average_monthly_hours</th>\n",
       "      <th>time_spend_company</th>\n",
       "      <th>Work_accident</th>\n",
       "      <th>promotion_last_5years</th>\n",
       "      <th>department</th>\n",
       "      <th>salary</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>-0.936495</td>\n",
       "      <td>-1.087275</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.285047</td>\n",
       "      <td>0.125</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.777778</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.752814</td>\n",
       "      <td>0.840707</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.775701</td>\n",
       "      <td>0.500</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.777778</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>-2.022479</td>\n",
       "      <td>0.957554</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.822430</td>\n",
       "      <td>0.250</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.777778</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.431041</td>\n",
       "      <td>0.899131</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.593458</td>\n",
       "      <td>0.375</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.777778</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>-0.976716</td>\n",
       "      <td>-1.145699</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.294393</td>\n",
       "      <td>0.125</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.777778</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14994</td>\n",
       "      <td>-0.856051</td>\n",
       "      <td>-0.853580</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.257009</td>\n",
       "      <td>0.125</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.888889</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14995</td>\n",
       "      <td>-0.976716</td>\n",
       "      <td>-1.379394</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.299065</td>\n",
       "      <td>0.125</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.888889</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14996</td>\n",
       "      <td>-0.976716</td>\n",
       "      <td>-1.087275</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.219626</td>\n",
       "      <td>0.125</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.888889</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14997</td>\n",
       "      <td>-2.022479</td>\n",
       "      <td>1.424944</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.859813</td>\n",
       "      <td>0.250</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.888889</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14998</td>\n",
       "      <td>-0.976716</td>\n",
       "      <td>-1.145699</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.289720</td>\n",
       "      <td>0.125</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.888889</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>14999 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       satisfaction_level  last_evaluation  number_project  \\\n",
       "0               -0.936495        -1.087275             0.0   \n",
       "1                0.752814         0.840707             0.6   \n",
       "2               -2.022479         0.957554             1.0   \n",
       "3                0.431041         0.899131             0.6   \n",
       "4               -0.976716        -1.145699             0.0   \n",
       "...                   ...              ...             ...   \n",
       "14994           -0.856051        -0.853580             0.0   \n",
       "14995           -0.976716        -1.379394             0.0   \n",
       "14996           -0.976716        -1.087275             0.0   \n",
       "14997           -2.022479         1.424944             0.8   \n",
       "14998           -0.976716        -1.145699             0.0   \n",
       "\n",
       "       average_monthly_hours  time_spend_company  Work_accident  \\\n",
       "0                   0.285047               0.125            0.0   \n",
       "1                   0.775701               0.500            0.0   \n",
       "2                   0.822430               0.250            0.0   \n",
       "3                   0.593458               0.375            0.0   \n",
       "4                   0.294393               0.125            0.0   \n",
       "...                      ...                 ...            ...   \n",
       "14994               0.257009               0.125            0.0   \n",
       "14995               0.299065               0.125            0.0   \n",
       "14996               0.219626               0.125            0.0   \n",
       "14997               0.859813               0.250            0.0   \n",
       "14998               0.289720               0.125            0.0   \n",
       "\n",
       "       promotion_last_5years  department  salary  \n",
       "0                        0.0    0.777778     0.0  \n",
       "1                        0.0    0.777778     0.5  \n",
       "2                        0.0    0.777778     0.5  \n",
       "3                        0.0    0.777778     0.0  \n",
       "4                        0.0    0.777778     0.0  \n",
       "...                      ...         ...     ...  \n",
       "14994                    0.0    0.888889     0.0  \n",
       "14995                    0.0    0.888889     0.0  \n",
       "14996                    0.0    0.888889     0.0  \n",
       "14997                    0.0    0.888889     0.0  \n",
       "14998                    0.0    0.888889     0.0  \n",
       "\n",
       "[14999 rows x 9 columns]"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 5.降维\n",
    "def preprocess3(lower_d=False,ld_n=1):\n",
    "    if lower_d:\n",
    "        # 因为标注只有两类，LDA降维只剩1类，所以不使用LDA，使用PCA\n",
    "        # return LinearDiscriminantAnalysis(n_components=ld_n)\n",
    "        return PCA(n_components=ld_n).fit_transform(df.values)\n",
    "\n",
    "preprocess3()\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 总的预处理函数\n",
    "def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,slr=False,dp=False,lower_d=False,ld_n=1):\n",
    "    # 读数据\n",
    "    df = pd.read_csv('./data/HR.csv')\n",
    "    # 1.清洗数据\n",
    "    df = df.dropna(subset=['satisfaction_level'])\n",
    "    df = df[df['last_evaluation']<=1][df['salary']!='nme']\n",
    "    # 2.得到标注\n",
    "    label = df['left']\n",
    "    df = df.drop('left',axis=1)\n",
    "    # 3.特征选择，特征较少，先不删除特征\n",
    "    # 4.特征处理\n",
    "    # 连续属性\n",
    "    scaler_lst = [sl,le,npr,amh,tsc,wa,pl5]\n",
    "    column_lst = ['satisfaction_level','last_evaluation','number_project','average_monthly_hours',\n",
    "                 'time_spend_company','Work_accident','promotion_last_5years']\n",
    "    for i in range(len(scaler_lst)):\n",
    "        if not scaler_lst[i]:\n",
    "            df[column_lst[i]] = MinMaxScaler().fit_transform(df[column_lst[i]].values.reshape(-1,1))\n",
    "        else:\n",
    "            df[column_lst[i]] = StandardScaler().fit_transform(df[column_lst[i]].values.reshape(-1,1))\n",
    "    # 离散属性\n",
    "    # 数值化重写map函数，把salary对应到我们想要的数值\n",
    "    def map_salary(s):\n",
    "        d = dict([('low',0),('medium',1),('high',2)])\n",
    "        return d.get(s,0)\n",
    "    scaler_lst = [slr,dp]\n",
    "    column_lst = ['salary','department']\n",
    "    for i in range(len(scaler_lst)):\n",
    "        if not scaler_lst[i]:\n",
    "            if column_lst[i] == 'salary':\n",
    "                df[column_lst[i]] = [map_salary(s) for s in df['salary'].values]\n",
    "            else:\n",
    "                df[column_lst[i]] = LabelEncoder().fit_transform(df[column_lst[i]])\n",
    "            # 归一化处理\n",
    "            df[column_lst[i]] = MinMaxScaler().fit_transform(df[column_lst[i]].values.reshape(-1, 1))\n",
    "        else:\n",
    "            df = pd.get_dummies(df,columns=[column_lst[i]])\n",
    "    # 5.降维\n",
    "    if lower_d:\n",
    "        # 因为标注只有两类，LDA降维只剩1类，所以不使用LDA，使用PCA\n",
    "        # return LinearDiscriminantAnalysis(n_components=ld_n)\n",
    "        return PCA(n_components=ld_n).fit_transform(df.values),label\n",
    "    return df,label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
